{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(security): pickle.load can execute arbitrary code while unpickling;\n",
    "# only load model files that come from a trusted source.\n",
    "# The context manager closes the file handle as soon as the model is loaded.\n",
    "with open('waf/trained_waf_model', 'rb') as model_file:\n",
    "    p = pickle.load(model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'memory': None,\n",
       " 'steps': [('vectorizer',\n",
       "   TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',\n",
       "           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "           lowercase=True, max_df=1.0, max_features=None, min_df=0.0,\n",
       "           ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,\n",
       "           stop_words=None, strip_accents=None, sublinear_tf=True,\n",
       "           token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n",
       "           vocabulary=None)),\n",
       "  ('classifier', LogisticRegression(C=1.0, class_weight='balanced', dual=False,\n",
       "             fit_intercept=True, intercept_scaling=1, max_iter=100,\n",
       "             multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n",
       "             solver='liblinear', tol=0.0001, verbose=0, warm_start=False))]}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vars(p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',\n",
       "        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "        lowercase=True, max_df=1.0, max_features=None, min_df=0.0,\n",
       "        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,\n",
       "        stop_words=None, strip_accents=None, sublinear_tf=True,\n",
       "        token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b', tokenizer=None, use_idf=True,\n",
       "        vocabulary=None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fetch the TF-IDF stage by its step name rather than by position, so the\n",
    "# lookup does not silently pick up a different stage if the step order changes.\n",
    "vec = p.named_steps['vectorizer']\n",
    "vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=1.0, class_weight='balanced', dual=False,\n",
       "          fit_intercept=True, intercept_scaling=1, max_iter=100,\n",
       "          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n",
       "          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fetch the logistic-regression stage by its step name rather than by position,\n",
    "# so the lookup does not silently pick up a different stage if the order changes.\n",
    "clf = p.named_steps['classifier']\n",
    "clf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[  9.88191796  13.29416517  13.98731235 ...,  14.39277746  14.39277746\n",
      "  14.39277746]\n"
     ]
    }
   ],
   "source": [
    "print(vec.idf_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[  3.86345441e+00   2.97867212e-02   1.67598454e-03 ...,   5.48339628e-06\n",
      "    5.48339628e-06   5.48339628e-06]]\n"
     ]
    }
   ],
   "source": [
    "print(clf.coef_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[  3.81783395e+01   3.95989592e-01   2.34425193e-02 ...,   7.89213024e-05\n",
      "    7.89213024e-05   7.89213024e-05]]\n"
     ]
    }
   ],
   "source": [
    "# Element-wise product of every term's IDF weight with its classifier\n",
    "# coefficient. coef_ is 2-D with a single row, so the product keeps that shape.\n",
    "term_influence = np.multiply(vec.idf_, clf.coef_)\n",
    "print(term_influence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[81937 92199     2 ..., 97829 97830 97831]]\n"
     ]
    }
   ],
   "source": [
    "print(np.argpartition(term_influence, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'/': 10522,\n",
       " 'j': 59431,\n",
       " 'a': 38945,\n",
       " 'v': 86105,\n",
       " 's': 79024,\n",
       " 'c': 43527,\n",
       " 'r': 76641,\n",
       " 'i': 57223,\n",
       " 'p': 72543,\n",
       " 't': 81489,\n",
       " 'm': 65680,\n",
       " 'o': 70279,\n",
       " 'n': 67925,\n",
       " 'e': 48183,\n",
       " 'y': 92321,\n",
       " '.': 9860,\n",
       " 'x': 90144,\n",
       " '\\n': 154,\n",
       " '/j': 11969,\n",
       " 'ja': 60022,\n",
       " 'av': 41076,\n",
       " 'va': 86839,\n",
       " 'as': 40902,\n",
       " 'sc': 80190,\n",
       " 'cr': 45330,\n",
       " 'ri': 78053,\n",
       " 'ip': 58842,\n",
       " 'pt': 74414,\n",
       " 't/': 81891,\n",
       " '/m': 12101,\n",
       " 'mo': 67318,\n",
       " 'on': 71801,\n",
       " 'ne': 69117,\n",
       " 'ey': 50667,\n",
       " 'y.': 92541,\n",
       " '.e': 10250,\n",
       " 'ex': 50609,\n",
       " 'xe': 91218,\n",
       " 'e\\n': 48184,\n",
       " '/ja': 11985,\n",
       " 'jav': 60068,\n",
       " 'ava': 41103,\n",
       " 'vas': 86886,\n",
       " 'asc': 40942,\n",
       " 'scr': 80235,\n",
       " 'cri': 45365,\n",
       " 'rip': 78100,\n",
       " 'ipt': 58892,\n",
       " 'pt/': 74433,\n",
       " 't/m': 81931,\n",
       " '/mo': 12138,\n",
       " 'mon': 67355,\n",
       " 'one': 71847,\n",
       " 'ney': 69175,\n",
       " 'ey.': 50678,\n",
       " 'y.e': 92549,\n",
       " '.ex': 10263,\n",
       " 'exe': 50644,\n",
       " 'xe\\n': 91219,\n",
       " '&': 4002,\n",
       " 'h': 55125,\n",
       " ' ': 348,\n",
       " 'u': 84107,\n",
       " 'z': 94408,\n",
       " 'l': 63324,\n",
       " '$': 3487,\n",
       " '(': 5071,\n",
       " '4': 21211,\n",
       " '+': 7199,\n",
       " '9': 30764,\n",
       " ')': 5690,\n",
       " \"'\": 4414,\n",
       " '&&': 4050,\n",
       " '&e': 4195,\n",
       " 'ec': 49425,\n",
       " 'ch': 44815,\n",
       " 'ho': 56615,\n",
       " 'o ': 70289,\n",
       " ' e': 1219,\n",
       " 'xu': 92006,\n",
       " 'um': 85377,\n",
       " 'mz': 67873,\n",
       " 'zl': 95568,\n",
       " 'l$': 63411,\n",
       " '$(': 3513,\n",
       " '((': 5187,\n",
       " '(4': 5319,\n",
       " '4+': 21331,\n",
       " '+9': 7509,\n",
       " '99': 31496,\n",
       " '9)': 30864,\n",
       " '))': 5827,\n",
       " ')$': 5772,\n",
       " '(e': 5483,\n",
       " 'l)': 63496,\n",
       " ')e': 6213,\n",
       " \"l'\": 63447,\n",
       " \"'\\n\": 4417,\n",
       " '&&e': 4061,\n",
       " '&ec': 4196,\n",
       " 'ech': 49458,\n",
       " 'cho': 44859,\n",
       " 'ho ': 56617,\n",
       " 'o e': 70317,\n",
       " ' ex': 1259,\n",
       " 'exu': 50660,\n",
       " 'xum': 92036,\n",
       " 'umz': 85431,\n",
       " 'mzl': 67906,\n",
       " 'zl$': 95571,\n",
       " 'l$(': 63412,\n",
       " '$((': 3517,\n",
       " '((4': 5195,\n",
       " '(4+': 5321,\n",
       " '4+9': 21344,\n",
       " '+99': 7521,\n",
       " '99)': 31504,\n",
       " '9))': 30872,\n",
       " '))$': 5832,\n",
       " ')$(': 5773,\n",
       " '$(e': 3518,\n",
       " '(ec': 5487,\n",
       " 'zl)': 95574,\n",
       " 'l)e': 63510,\n",
       " ')ex': 6236,\n",
       " \"zl'\": 95573,\n",
       " \"l'\\n\": 63448,\n",
       " '/y': 12656,\n",
       " 't/y': 81943,\n",
       " '/y.': 12662,\n",
       " '7': 27031,\n",
       " '1': 14970,\n",
       " 'b': 41354,\n",
       " '2': 17153,\n",
       " '?': 35512,\n",
       " '<': 33627,\n",
       " '-': 8262,\n",
       " 'q': 74786,\n",
       " '=': 33946,\n",
       " 'k': 61277,\n",
       " '\"': 2477,\n",
       " 'w': 88122,\n",
       " '8': 28882,\n",
       " '>': 35044,\n",
       " '/7': 11223,\n",
       " '7u': 28641,\n",
       " 'u1': 84348,\n",
       " '1p': 16690,\n",
       " 'pb': 73496,\n",
       " 'b2': 41705,\n",
       " '2x': 19066,\n",
       " 'xi': 91415,\n",
       " 'i.': 57431,\n",
       " '.j': 10308,\n",
       " 'js': 60895,\n",
       " 'sp': 80882,\n",
       " 'p?': 73318,\n",
       " '?<': 35616,\n",
       " '<m': 33856,\n",
       " 'me': 66784,\n",
       " 'et': 50386,\n",
       " 'ta': 82704,\n",
       " 'a ': 38959,\n",
       " ' h': 1328,\n",
       " 'ht': 56876,\n",
       " 'tt': 83708,\n",
       " 'tp': 83490,\n",
       " 'p-': 72707,\n",
       " '-e': 8997,\n",
       " 'eq': 50203,\n",
       " 'qu': 76371,\n",
       " 'ui': 85174,\n",
       " 'iv': 59168,\n",
       " 'v=': 86742,\n",
       " '=s': 34888,\n",
       " 'se': 80293,\n",
       " 't-': 81820,\n",
       " '-c': 8914,\n",
       " 'co': 45180,\n",
       " 'oo': 71875,\n",
       " 'ok': 71633,\n",
       " 'ki': 62444,\n",
       " 'ie': 58240,\n",
       " 'e ': 48190,\n",
       " ' c': 1134,\n",
       " 'nt': 69906,\n",
       " 'te': 82916,\n",
       " 'en': 50035,\n",
       " 't=': 82461,\n",
       " '=\"': 33996,\n",
       " '\"t': 3104,\n",
       " 'es': 50324,\n",
       " 'st': 81110,\n",
       " 'tl': 83285,\n",
       " 'lw': 65472,\n",
       " 'ww': 89952,\n",
       " 'wz': 90093,\n",
       " 'z=': 94960,\n",
       " '=7': 34413,\n",
       " '71': 27303,\n",
       " '18': 15799,\n",
       " '8\"': 28941,\n",
       " '\">': 2873,\n",
       " '>\\n': 35049,\n",
       " '/7u': 11259,\n",
       " '7u1': 28644,\n",
       " 'u1p': 84375,\n",
       " '1pb': 16706,\n",
       " 'pb2': 73507,\n",
       " 'b2x': 41748,\n",
       " '2xi': 19090,\n",
       " 'xi.': 91423,\n",
       " 'i.j': 57444,\n",
       " '.js': 10320,\n",
       " 'jsp': 60938,\n",
       " 'sp?': 80908,\n",
       " 'p?<': 73329,\n",
       " '?<m': 35621,\n",
       " '<me': 33858,\n",
       " 'met': 66839,\n",
       " 'eta': 50423,\n",
       " 'ta ': 82706,\n",
       " 'a h': 38988,\n",
       " ' ht': 1356,\n",
       " 'htt': 56920,\n",
       " 'ttp': 83749,\n",
       " 'tp-': 83499,\n",
       " 'p-e': 72724,\n",
       " '-eq': 9029,\n",
       " 'equ': 50248,\n",
       " 'qui': 76401,\n",
       " 'uiv': 85218,\n",
       " 'iv=': 59191,\n",
       " 'v=s': 86762,\n",
       " '=se': 34898,\n",
       " 'set': 80347,\n",
       " 'et-': 50400,\n",
       " 't-c': 81836,\n",
       " '-co': 8944,\n",
       " 'coo': 45219,\n",
       " 'ook': 71918,\n",
       " 'oki': 71669,\n",
       " 'kie': 62472,\n",
       " 'ie ': 58242,\n",
       " 'e c': 48226,\n",
       " ' co': 1163,\n",
       " 'con': 45218,\n",
       " 'ont': 71862,\n",
       " 'nte': 69945,\n",
       " 'ten': 82961,\n",
       " 'ent': 50089,\n",
       " 'nt=': 69934,\n",
       " 't=\"': 82464,\n",
       " '=\"t': 34049,\n",
       " '\"te': 3109,\n",
       " 'tes': 82966,\n",
       " 'est': 50378,\n",
       " 'stl': 81159,\n",
       " 'tlw': 83327,\n",
       " 'lww': 65515,\n",
       " 'wwz': 89999,\n",
       " 'wz=': 90112,\n",
       " 'z=7': 94964,\n",
       " '=71': 34418,\n",
       " '718': 27323,\n",
       " '18\"': 15803,\n",
       " '8\">': 28944,\n",
       " '\">\\n': 2875,\n",
       " 'f': 50796,\n",
       " '3': 19195,\n",
       " '_': 37164,\n",
       " 'g': 52953,\n",
       " 'm2': 66088,\n",
       " '2f': 18326,\n",
       " 'fp': 52398,\n",
       " 'pz': 74730,\n",
       " 'zt': 95948,\n",
       " 'ty': 83960,\n",
       " '.p': 10368,\n",
       " 'ph': 73803,\n",
       " 'hp': 56669,\n",
       " 'p3': 72955,\n",
       " '3?': 20044,\n",
       " '<s': 33886,\n",
       " 't>': 82521,\n",
       " '>c': 35370,\n",
       " 'ro': 78387,\n",
       " 'os': 72103,\n",
       " 'ss': 81041,\n",
       " 's_': 80037,\n",
       " '_s': 38447,\n",
       " 'si': 80518,\n",
       " 'it': 59057,\n",
       " 'e_': 49266,\n",
       " 'ti': 83135,\n",
       " 'in': 58725,\n",
       " 'ng': 69228,\n",
       " 'g.': 53185,\n",
       " '.n': 10348,\n",
       " 'na': 68897,\n",
       " 'sl': 80673,\n",
       " 'l<': 64139,\n",
       " '</': 33698,\n",
       " '/s': 12374,\n",
       " '/m2': 12113,\n",
       " 'm2f': 66112,\n",
       " '2fp': 18362,\n",
       " 'fpz': 52448,\n",
       " 'pzt': 74770,\n",
       " 'ztt': 95986,\n",
       " 'tty': 83758,\n",
       " 'ty.': 83973,\n",
       " 'y.p': 92560,\n",
       " '.ph': 10376,\n",
       " 'php': 73840,\n",
       " 'hp3': 56684,\n",
       " 'p3?': 72973,\n",
       " '3?<': 20048,\n",
       " '?<s': 35622,\n",
       " '<sc': 33893,\n",
       " 'pt>': 74448,\n",
       " 't>c': 82551,\n",
       " '>cr': 35373,\n",
       " 'cro': 45371,\n",
       " 'ros': 78432,\n",
       " 'oss': 72150,\n",
       " 'ss_': 81077,\n",
       " 's_s': 80072,\n",
       " '_si': 38472,\n",
       " 'sit': 80561,\n",
       " 'ite': 59097,\n",
       " 'te_': 82947,\n",
       " 'e_s': 49302,\n",
       " '_sc': 38466,\n",
       " 'pti': 74464,\n",
       " 'tin': 83173,\n",
       " 'ing': 58765,\n",
       " 'ng.': 69242,\n",
       " 'g.n': 53202,\n",
       " '.na': 10349,\n",
       " 'nas': 68943,\n",
       " 'asl': 40951,\n",
       " 'sl<': 80696,\n",
       " 'l</': 64141,\n",
       " '</s': 33720,\n",
       " '/sc': 12399,\n",
       " 't>\\n': 82522,\n",
       " ';': 33026,\n",
       " ';e': 33382,\n",
       " 'o$': 70361,\n",
       " '$i': 3687,\n",
       " 'if': 58301,\n",
       " 'fs': 52547,\n",
       " 'sx': 81324,\n",
       " 'xk': 91511,\n",
       " 'kg': 62345,\n",
       " 'gk': 54324,\n",
       " 'kw': 63120,\n",
       " 'wo': 89550,\n",
       " '(8': 5374,\n",
       " '82': 29222,\n",
       " '2+': 17285,\n",
       " '+2': 7391,\n",
       " '28': 17867,\n",
       " '8)': 28977,\n",
       " 'o)': 70411,\n",
       " ')x': 6744,\n",
       " \"o'\": 70388,\n",
       " ';ec': 33386,\n",
       " 'ho$': 56619,\n",
       " 'o$i': 70363,\n",
       " '$if': 3688,\n",
       " 'ifs': 58350,\n",
       " 'fsx': 52597,\n",
       " 'sxk': 81358,\n",
       " 'xkg': 91537,\n",
       " 'kgk': 62378,\n",
       " 'gkw': 54366,\n",
       " 'kwo': 63156,\n",
       " 'wo$': 89553,\n",
       " 'o$(': 70362,\n",
       " '((8': 5199,\n",
       " '(82': 5379,\n",
       " '82+': 29229,\n",
       " '2+2': 17291,\n",
       " '+28': 7402,\n",
       " '28)': 17875,\n",
       " '8))': 28983,\n",
       " 'wo)': 89555,\n",
       " 'o)x': 70440,\n",
       " ')xk': 6758,\n",
       " \"wo'\": 89554,\n",
       " \"o'\\n\": 70389,\n",
       " '/e': 11738,\n",
       " 'xa': 91026,\n",
       " 'am': 40565,\n",
       " 'mp': 67369,\n",
       " 'pl': 73998,\n",
       " 'le': 64518,\n",
       " 's/': 79373,\n",
       " 'p/': 72783,\n",
       " '/c': 11642,\n",
       " 'he': 56128,\n",
       " 'ck': 44968,\n",
       " 'kb': 62103,\n",
       " 'bo': 42919,\n",
       " 'ox': 72377,\n",
       " 'x/': 90405,\n",
       " 'cw': 45602,\n",
       " 'wm': 89453,\n",
       " 'ma': 66576,\n",
       " 'ai': 40362,\n",
       " 'il': 58612,\n",
       " 'l.': 63613,\n",
       " '/ex': 11781,\n",
       " 'exa': 50640,\n",
       " 'xam': 91060,\n",
       " 'amp': 40609,\n",
       " 'mpl': 67409,\n",
       " 'ple': 74028,\n",
       " 'les': 64572,\n",
       " 'es/': 50340,\n",
       " 's/j': 79411,\n",
       " '/js': 12003,\n",
       " 'sp/': 80895,\n",
       " 'p/c': 72801,\n",
       " '/ch': 11673,\n",
       " 'che': 44849,\n",
       " 'hec': 56155,\n",
       " 'eck': 49461,\n",
       " 'ckb': 45004,\n",
       " 'kbo': 62138,\n",
       " 'box': 42966,\n",
       " 'ox/': 72391,\n",
       " 'x/c': 90419,\n",
       " '/cw': 11688,\n",
       " 'cwm': 45636,\n",
       " 'wma': 89474,\n",
       " 'mai': 66614,\n",
       " 'ail': 40395,\n",
       " 'il.': 58623,\n",
       " 'l.e': 63624,\n",
       " 'd': 45818,\n",
       " 'cg': 44765,\n",
       " 'gi': 54222,\n",
       " 'i-': 57389,\n",
       " '-b': 8875,\n",
       " 'bi': 42616,\n",
       " 'n/': 68251,\n",
       " '/b': 11595,\n",
       " 'bt': 43174,\n",
       " 'td': 82864,\n",
       " 'do': 47547,\n",
       " 'ow': 72317,\n",
       " 'wn': 89500,\n",
       " 'nl': 69499,\n",
       " 'lo': 65055,\n",
       " 'oa': 71111,\n",
       " 'ad': 40089,\n",
       " 'd.': 46126,\n",
       " '?t': 35970,\n",
       " 'yp': 93853,\n",
       " 'pe': 73644,\n",
       " 'e=': 49103,\n",
       " '=t': 34914,\n",
       " 'to': 83431,\n",
       " 'or': 72042,\n",
       " 'rr': 78540,\n",
       " 're': 77834,\n",
       " 't&': 81635,\n",
       " '&f': 4203,\n",
       " 'fi': 52056,\n",
       " '=.': 34212,\n",
       " '..': 9945,\n",
       " './': 9970,\n",
       " '/.': 10795,\n",
       " 'tc': 82810,\n",
       " 'c/': 43793,\n",
       " '/p': 12239,\n",
       " 'pa': 73442,\n",
       " 'sw': 81273,\n",
       " 'wd': 89004,\n",
       " 'd\\n': 45830,\n",
       " '/cg': 11672,\n",
       " 'cgi': 44796,\n",
       " 'gi-': 54231,\n",
       " 'i-b': 57406,\n",
       " '-bi': 8896,\n",
       " 'bin': 42653,\n",
       " 'in/': 58740,\n",
       " 'n/b': 68270,\n",
       " '/bt': 11635,\n",
       " 'btd': 43204,\n",
       " 'tdo': 82903,\n",
       " 'dow': 47595,\n",
       " 'own': 72363,\n",
       " 'wnl': 89534,\n",
       " 'nlo': 69537,\n",
       " 'loa': 65086,\n",
       " 'oad': 71136,\n",
       " 'ad.': 40105,\n",
       " 'd.p': 46145,\n",
       " 'hp?': 56695,\n",
       " 'p?t': 73354,\n",
       " '?ty': 35986,\n",
       " 'typ': 84007,\n",
       " 'ype': 93877,\n",
       " 'pe=': 73669,\n",
       " 'e=t': 49157,\n",
       " '=to': 34923,\n",
       " 'tor': 83481,\n",
       " 'orr': 72093,\n",
       " 'rre': 78564,\n",
       " 'ren': 77883,\n",
       " 'nt&': 69912,\n",
       " 't&f': 81647,\n",
       " '&fi': 4209,\n",
       " 'fil': 52091,\n",
       " 'ile': 58647,\n",
       " 'le=': 64546,\n",
       " 'e=.': 49115,\n",
       " '=..': 34217,\n",
       " '../': 9951,\n",
       " './.': 9974,\n",
       " '/..': 10802,\n",
       " './e': 9981,\n",
       " '/et': 11777,\n",
       " 'etc': 50425,\n",
       " 'tc/': 82820,\n",
       " 'c/p': 43823,\n",
       " '/pa': 12260,\n",
       " 'pas': 73488,\n",
       " 'ass': 40958,\n",
       " 'ssw': 81101,\n",
       " 'swd': 81300,\n",
       " 'wd\\n': 89006,\n",
       " '5': 23159,\n",
       " '/a': 11544,\n",
       " 'ap': 40730,\n",
       " 'vx': 87973,\n",
       " 'ug': 85078,\n",
       " 'gx': 54969,\n",
       " 'x.': 90372,\n",
       " '.h': 10288,\n",
       " 'tm': 83332,\n",
       " 'ml': 67153,\n",
       " 'l?': 64187,\n",
       " 'dn': 47496,\n",
       " 'nb': 68952,\n",
       " 'bh': 42568,\n",
       " 'h=': 55811,\n",
       " '=5': 34380,\n",
       " '52': 23526,\n",
       " '23': 17587,\n",
       " '37': 19832,\n",
       " '7\"': 27087,\n",
       " '/ap': 11584,\n",
       " 'apa': 40760,\n",
       " 'pav': 73491,\n",
       " 'avx': 41126,\n",
       " 'vxu': 88014,\n",
       " 'xug': 92030,\n",
       " 'ugx': 85125,\n",
       " 'gx.': 54978,\n",
       " 'x.h': 90387,\n",
       " '.ht': 10297,\n",
       " 'htm': 56913,\n",
       " 'tml': 83366,\n",
       " 'ml?': 67184,\n",
       " 'l?<': 64200,\n",
       " 'std': 81151,\n",
       " 'tdn': 82902,\n",
       " 'dnb': 47522,\n",
       " 'nbh': 68981,\n",
       " 'bh=': 42586,\n",
       " 'h=5': 55824,\n",
       " '=52': 34386,\n",
       " '523': 23542,\n",
       " '237': 17608,\n",
       " '37\"': 19835,\n",
       " '7\">': 27092,\n",
       " '+a': 7552,\n",
       " 'dw': 47954,\n",
       " 'w-': 88271,\n",
       " '-s': 9551,\n",
       " 't+': 81773,\n",
       " 'd4': 46380,\n",
       " '4-': 21372,\n",
       " '-a': 8831,\n",
       " 'al': 40504,\n",
       " 'er': 50255,\n",
       " 'rt': 78647,\n",
       " 't(': 81683,\n",
       " '(1': 5268,\n",
       " '1)': 15136,\n",
       " ');': 5964,\n",
       " ';+': 33164,\n",
       " '-/': 8408,\n",
       " '-\\n': 8263,\n",
       " '+ad': 7561,\n",
       " 'adw': 40146,\n",
       " 'dw-': 47962,\n",
       " 'w-s': 88302,\n",
       " '-sc': 9568,\n",
       " 'pt+': 74429,\n",
       " 't+a': 81783,\n",
       " 'ad4': 40111,\n",
       " 'd4-': 46386,\n",
       " '4-a': 21391,\n",
       " '-al': 8860,\n",
       " 'ale': 40542,\n",
       " 'ler': 64571,\n",
       " 'ert': 50315,\n",
       " 'rt(': 78657,\n",
       " 't(1': 81696,\n",
       " '(1)': 5272,\n",
       " '1);': 15152,\n",
       " ');+': 5977,\n",
       " ';+a': 33166,\n",
       " 'w-/': 88273,\n",
       " '-/s': 8421,\n",
       " '4-\\n': 21373,\n",
       " ':': 32669,\n",
       " '//': 10832,\n",
       " '/<': 11364,\n",
       " '<b': 33798,\n",
       " 'od': 71269,\n",
       " 'dy': 48053,\n",
       " 'y>': 93052,\n",
       " '><': 35256,\n",
       " 't ': 81517,\n",
       " ' s': 1728,\n",
       " 'sr': 80991,\n",
       " 'rc': 77717,\n",
       " 'c=': 44293,\n",
       " '=h': 34734,\n",
       " 'p:': 73244,\n",
       " ':/': 32759,\n",
       " '/w': 12561,\n",
       " 'w.': 88310,\n",
       " 'e.': 48522,\n",
       " '.c': 10211,\n",
       " 'om': 71740,\n",
       " 'm/': 65957,\n",
       " '/t': 12423,\n",
       " 't?': 82571,\n",
       " '?r': 35933,\n",
       " 'rn': 78333,\n",
       " 'nd': 69053,\n",
       " 'd=': 46686,\n",
       " '=q': 34857,\n",
       " 'q7': 75230,\n",
       " '77': 27631,\n",
       " '74': 27465,\n",
       " '44': 21690,\n",
       " '47': 21852,\n",
       " '73': 27409,\n",
       " '1>': 15992,\n",
       " '//<': 10861,\n",
       " '/<b': 11373,\n",
       " '<bo': 33807,\n",
       " 'bod': 42946,\n",
       " 'ody': 71322,\n",
       " 'dy>': 48078,\n",
       " 'y><': 93056,\n",
       " '><s': 35288,\n",
       " 'pt ': 74420,\n",
       " 't s': 81568,\n",
       " ' sr': 1765,\n",
       " 'src': 81017,\n",
       " 'rc=': 77742,\n",
       " 'c=h': 44327,\n",
       " '=ht': 34746,\n",
       " 'tp:': 83512,\n",
       " 'p:/': 73246,\n",
       " '://': 32765,\n",
       " '//w': 10893,\n",
       " '/ww': 12601,\n",
       " 'www': 89996,\n",
       " 'ww.': 89959,\n",
       " 'w.e': 88318,\n",
       " 'le.': 64531,\n",
       " 'e.c': 48532,\n",
       " '.co': 10226,\n",
       " 'com': 45217,\n",
       " 'om/': 71754,\n",
       " 'm/t': 65991,\n",
       " '/te': 12448,\n",
       " 'st?': 81142,\n",
       " 't?r': 82598,\n",
       " '?rn': 35939,\n",
       " 'rnd': 78364,\n",
       " 'nd=': 69085,\n",
       " 'd=q': 46732,\n",
       " '=q7': 34865,\n",
       " 'q77': 75240,\n",
       " '774': 27647,\n",
       " '744': 27485,\n",
       " '447': 21710,\n",
       " '473': 21866,\n",
       " '737': 27429,\n",
       " '371': 19847,\n",
       " '71>': 27328,\n",
       " '1><': 15997,\n",
       " '></': 35266,\n",
       " '0': 12808,\n",
       " '/?': 11410,\n",
       " 'iu': 59119,\n",
       " 'ul': 85326,\n",
       " 'lz': 65621,\n",
       " '=3': 34340,\n",
       " '70': 27248,\n",
       " '09': 13652,\n",
       " '9\"': 30813,\n",
       " '/?<': 11415,\n",
       " 'sti': 81156,\n",
       " 'tiu': 83180,\n",
       " 'iul': 59152,\n",
       " 'ulz': 85376,\n",
       " 'lz=': 65640,\n",
       " 'z=3': 94962,\n",
       " '=37': 34354,\n",
       " '370': 19846,\n",
       " '709': 27272,\n",
       " '09\"': 13657,\n",
       " '9\">': 30817,\n",
       " '|': 96441,\n",
       " '[': 36168,\n",
       " '\\\\': 36509,\n",
       " ']': 36846,\n",
       " '| ': 96443,\n",
       " ' [': 969,\n",
       " '[ ': 36172,\n",
       " ' 1': 632,\n",
       " '1 ': 14978,\n",
       " ' -': 550,\n",
       " '-n': 9354,\n",
       " ' $': 400,\n",
       " ' p': 1613,\n",
       " 'ef': 49600,\n",
       " 'fr': 52497,\n",
       " 'rd': 77774,\n",
       " 'd ': 45842,\n",
       " ' |': 2058,\n",
       " ' t': 1774,\n",
       " 'tr': 83590,\n",
       " 'r ': 76656,\n",
       " '-d': 8956,\n",
       " \" '\": 443,\n",
       " \"'\\\\\": 4787,\n",
       " '\\\\n': 36752,\n",
       " \"n'\": 68059,\n",
       " \"' \": 4431,\n",
       " ' w': 1891,\n",
       " 'wc': 88953,\n",
       " 'c ': 43542,\n",
       " 'c)': 43657,\n",
       " ') ': 5702,\n",
       " ' ]': 1012,\n",
       " '] ': 36850,\n",
       " '||': 96561,\n",
       " 'ee': 49539,\n",
       " 'ep': 50150,\n",
       " 'p ': 72553,\n",
       " ' \\\\': 1001,\n",
       " '\\\\\\\\': 36658,\n",
       " '\\\\\\n': 36510,\n",
       " '| [': 96446,\n",
       " ' [ ': 972,\n",
       " '[ 1': 36173,\n",
       " ' 1 ': 634,\n",
       " '1 -': 14987,\n",
       " ' -n': 574,\n",
       " '-ne': 9374,\n",
       " 'ne ': 69119,\n",
       " 'e $': 48195,\n",
       " ' $(': 403,\n",
       " 'o p': 70328,\n",
       " ' pe': 1627,\n",
       " 'pef': 73680,\n",
       " 'efr': 49646,\n",
       " 'frr': 52538,\n",
       " 'rrd': 78563,\n",
       " 'rd ': 77776,\n",
       " 'd |': 45896,\n",
       " ' | ': 2060,\n",
       " '| t': 96454,\n",
       " ' tr': 1800,\n",
       " 'tr ': 83592,\n",
       " 'r -': 76666,\n",
       " ' -d': 564,\n",
       " '-d ': 8957,\n",
       " \"d '\": 45848,\n",
       " \" '\\\\\": 465,\n",
       " \"'\\\\n\": 4795,\n",
       " \"\\\\n'\": 36755,\n",
       " \"n' \": 68061,\n",
       " \"' |\": 4467,\n",
       " '| w': 96455,\n",
       " ' wc': 1903,\n",
       " 'wc ': 88955,\n",
       " 'c -': 43545,\n",
       " ' -c': 563,\n",
       " '-c)': 8916,\n",
       " 'c) ': 43658,\n",
       " ') ]': 5724,\n",
       " ' ] ': 1014,\n",
       " '] |': 36875,\n",
       " ' ||': 2063,\n",
       " '|| ': 96563,\n",
       " '| s': 96453,\n",
       " ' sl': 1759,\n",
       " 'sle': 80708,\n",
       " 'lee': 64558,\n",
       " 'eep': 49587,\n",
       " 'ep ': 50152,\n",
       " 'p 1': 72559,\n",
       " '1 \\\\': 15003,\n",
       " ' \\\\\\\\': 1006,\n",
       " '\\\\\\\\\\n': 36659,\n",
       " 'r.': 76941,\n",
       " 't/p': 81934,\n",
       " 'pap': 73485,\n",
       " 'ape': 40764,\n",
       " 'per': 73692,\n",
       " 'er.': 50273,\n",
       " 'r.e': 76949,\n",
       " '/g': 11830,\n",
       " 'g8': 53583,\n",
       " '8n': 30279,\n",
       " 'nu': 69969,\n",
       " 'u2': 84384,\n",
       " '2y': 19107,\n",
       " 'yy': 94288,\n",
       " 'y7': 92880,\n",
       " '7.': 27203,\n",
       " '.f': 10265,\n",
       " 'ft': 52600,\n",
       " 'ts': 83646,\n",
       " 's?': 79954,\n",
       " '<i': 33840,\n",
       " 'im': 58670,\n",
       " 'mg': 66897,\n",
       " 'g ': 52965,\n",
       " '\"j': 3037,\n",
       " 't:': 82371,\n",
       " ':a': 32881,\n",
       " '(c': 5460,\n",
       " ';\"': 33072,\n",
       " '/g8': 11845,\n",
       " 'g8n': 53610,\n",
       " '8nu': 30312,\n",
       " 'nu2': 69981,\n",
       " 'u2y': 84422,\n",
       " '2yy': 19143,\n",
       " 'yy7': 94305,\n",
       " 'y7.': 92882,\n",
       " '7.f': 27216,\n",
       " '.ft': 10277,\n",
       " 'fts': 52647,\n",
       " 'ts?': 83677,\n",
       " 's?<': 79959,\n",
       " '?<i': 35620,\n",
       " '<im': 33846,\n",
       " 'img': 58704,\n",
       " 'mg ': 66899,\n",
       " 'g s': 53005,\n",
       " 'c=\"': 44296,\n",
       " '=\"j': 34040,\n",
       " '\"ja': 3040,\n",
       " 'pt:': 74444,\n",
       " 't:a': 82392,\n",
       " ':al': 32886,\n",
       " 't(c': 81711,\n",
       " '(cr': 5470,\n",
       " 'sl)': 80680,\n",
       " 'l);': 63503,\n",
       " ');\"': 5969,\n",
       " ';\">': 33081,\n",
       " '/\"': 10581,\n",
       " '>\\\\': 35337,\n",
       " '\\\\x': 36814,\n",
       " 'xd': 91170,\n",
       " 'd0': 46195,\n",
       " '0\\\\': 13792,\n",
       " 'xb': 91075,\n",
       " 'bc': 42308,\n",
       " 'c\\\\': 44370,\n",
       " 'd1': 46238,\n",
       " '1\\\\': 16012,\n",
       " 'x8': 90770,\n",
       " '83': 29275,\n",
       " '3\\\\': 20070,\n",
       " 'b7': 41924,\n",
       " '7\\\\': 27825,\n",
       " '8b': 29792,\n",
       " 'b\\\\': 42138,\n",
       " 'ba': 42204,\n",
       " 'a\\\\': 39850,\n",
       " 'b0': 41612,\n",
       " '0<': 13753,\n",
       " '/\\n': 10528,\n",
       " '/\">': 10590,\n",
       " '\">\\\\': 2892,\n",
       " '>\\\\x': 35341,\n",
       " '\\\\xd': 36831,\n",
       " 'xd0': 91179,\n",
       " 'd0\\\\': 46213,\n",
       " '0\\\\x': 13799,\n",
       " '\\\\xb': 36829,\n",
       " 'xbc': 91095,\n",
       " 'bc\\\\': 42334,\n",
       " 'c\\\\x': 44383,\n",
       " 'xd1': 91180,\n",
       " 'd1\\\\': 46258,\n",
       " '1\\\\x': 16022,\n",
       " '\\\\x8': 36826,\n",
       " 'x83': 90778,\n",
       " '83\\\\': 29303,\n",
       " '3\\\\x': 20074,\n",
       " 'xb7': 91089,\n",
       " 'b7\\\\': 41939,\n",
       " '7\\\\x': 27830,\n",
       " 'x8b': 90787,\n",
       " '8b\\\\': 29809,\n",
       " 'b\\\\x': 42146,\n",
       " 'xba': 91093,\n",
       " 'ba\\\\': 42228,\n",
       " 'a\\\\x': 39862,\n",
       " 'xb0': 91082,\n",
       " 'b0<': 41632,\n",
       " '0</': 13755,\n",
       " '</\\n': 33699,\n",
       " '*': 6910,\n",
       " '/f': 11784,\n",
       " 'fo': 52345,\n",
       " 'ru': 78713,\n",
       " 'm.': 65928,\n",
       " '?i': 35825,\n",
       " 'id': 58174,\n",
       " 'd_': 46784,\n",
       " '_a': 37723,\n",
       " 'ar': 40837,\n",
       " 'ic': 58119,\n",
       " 'cl': 45030,\n",
       " '=1': 34280,\n",
       " '1&': 15082,\n",
       " '&i': 4240,\n",
       " '_f': 37937,\n",
       " 'm=': 66435,\n",
       " '=-': 34197,\n",
       " '-1': 8452,\n",
       " '1/': 15308,\n",
       " '/*': 10706,\n",
       " '**': 6990,\n",
       " '*/': 7022,\n",
       " '/u': 12470,\n",
       " 'un': 85433,\n",
       " 'ni': 69339,\n",
       " 'io': 58787,\n",
       " 'el': 49915,\n",
       " 'ct': 45438,\n",
       " ' 2': 672,\n",
       " '20': 17418,\n",
       " '08': 13599,\n",
       " '80': 29118,\n",
       " '03': 13329,\n",
       " '75': 27524,\n",
       " '29': 17926,\n",
       " '9-': 30920,\n",
       " '--': 8378,\n",
       " 'pts': 74474,\n",
       " 'ts/': 83663,\n",
       " 's/f': 79407,\n",
       " '/fo': 11818,\n",
       " 'for': 52389,\n",
       " 'oru': 72096,\n",
       " 'rum': 78750,\n",
       " 'um.': 85387,\n",
       " 'm.p': 65946,\n",
       " '3?i': 20057,\n",
       " '?id': 35829,\n",
       " 'id_': 58209,\n",
       " 'd_a': 46801,\n",
       " '_ar': 37758,\n",
       " 'art': 40893,\n",
       " 'rti': 78693,\n",
       " 'tic': 83162,\n",
       " 'icl': 58158,\n",
       " 'cle': 45058,\n",
       " 'e=1': 49118,\n",
       " '=1&': 34285,\n",
       " '1&i': 15096,\n",
       " '&id': 4242,\n",
       " 'd_f': 46806,\n",
       " '_fo': 37965,\n",
       " 'um=': 85401,\n",
       " 'm=-': 66439,\n",
       " '=-1': 34200,\n",
       " '-1/': 8462,\n",
       " ...}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec.vocabulary_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invert the vectorizer's token -> feature-index mapping so that a token\n",
    "# can be looked up from its feature index.\n",
    "vocab = {index: token for token, index in vec.vocabulary_.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature index of the smallest term_influence value. term_influence is 2-D\n",
    "# with a single row, so argmin over that row yields the column index directly;\n",
    "# on ties argmin returns the first occurrence.\n",
    "term_idx = np.argmin(term_influence[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "t/s\n"
     ]
    }
   ],
   "source": [
    "print(vocab[term_idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "payload = \"<script>alert(1)</script>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p.predict([payload])[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  1.86163618e-09,   9.99999998e-01])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p.predict_proba([payload])[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  1.83734699e-07,   9.99999816e-01])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class probabilities for the payload followed by '/' and one copy of the\n",
    "# selected term.\n",
    "p.predict_proba(['/'.join((payload, vocab[term_idx]))])[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.50142443,  0.49857557])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class probabilities for the payload followed by '/' and 258 copies of the\n",
    "# selected term. In the recorded run this count puts the class-0 probability\n",
    "# just above 0.5 (see the output of this cell).\n",
    "p.predict_proba(['/'.join((payload, vocab[term_idx] * 258))])[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Hard label for the payload followed by '/' and 258 copies of the selected\n",
    "# term; the recorded run returns class 0 for this input.\n",
    "p.predict(['/'.join((payload, vocab[term_idx] * 258))])[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<script>alert(1)</script>/t/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/st/s\n"
     ]
    }
   ],
   "source": [
    "print(payload + '/' + vocab[term_idx]*258)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
